# orig data missing???
SUB <- read_csv("data-raw/geo/suburb-and-adjoining-suburb-november-2019.zip") %>%
clean_names() %>%
remove_empty(c("rows", "cols")) %>%
select(suburb_name) %>%
rename(SSC_NAME16 = suburb_name) %>%
distinct() %>%
mutate(SSC_NAME16 = str_to_title(SSC_NAME16)) %>%
arrange(SSC_NAME16)
write_rds(SUB, "data/geo/clean/SUB.Rds")Full ( hopefully ;) list of Brisbane suburbs. Top 5 alphabetically:
SUB <- read_rds("data/geo/SUB.Rds")
# glimpse(SUB)
SUB %>%
slice(1:5) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"))| SSC_NAME16 |
|---|
| Acacia Ridge |
| Albion |
| Alderley |
| Algester |
| Annerley |
This might very well include areas with no pops (and therefore no SEIFA), for instance:
SUB %>%
filter(str_detect(SSC_NAME16,
regex("port", ignore_case = TRUE))) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"))| SSC_NAME16 |
|---|
| Brisbane Airport |
| Port Of Brisbane |
Names from Brisbane containing (Brisbane - Qld), names from Qld containing (Qld) have to be cleaned to match BCC data.
Mcdowall is called McDowall and Mount Coot-tha is Mount Coot-Tha - these have been unifeied as well.
unzip("data-raw/geo/1270055003_ssc_2016_aust_shape.zip",
exdir = "data-raw/geo")
SSC <- st_read("data-raw/geo/1270055003_ssc_2016_aust_shape/SSC_2016_AUST.shp",
stringsAsFactors = FALSE) %>%
mutate(SSC_CODE16 = as.integer(SSC_CODE16)) %>%
select(-STE_NAME16, -STE_CODE16, -AREASQKM16) %>%
st_transform(3112) %>%
filter(!st_is_empty(geometry)) %>%
mutate(SSC_NAME16_orig = SSC_NAME16) %>%
mutate(SSC_NAME16 = str_remove(SSC_NAME16,
fixed(" (Brisbane - Qld)"))) %>%
mutate(SSC_NAME16 = str_remove(SSC_NAME16,
fixed(" (Qld)"))) %>%
mutate(SSC_NAME16 = ifelse(SSC_NAME16 == "McDowall",
"Mcdowall", SSC_NAME16)) %>%
mutate(SSC_NAME16 = ifelse(SSC_NAME16 == "Mount Coot-tha",
"Mount Coot-Tha", SSC_NAME16))## Reading layer `SSC_2016_AUST' from data source `C:\external\FUN_BCC-animals\data-raw\geo\1270055003_ssc_2016_aust_shape\SSC_2016_AUST.shp' using driver `ESRI Shapefile'
## Simple feature collection with 3264 features and 5 fields (with 2 geometries empty)
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 137.9943 ymin: -29.1779 xmax: 153.5522 ymax: -9.142176
## Geodetic CRS: GDA94
# SSC <- ms_simplify(SSC, keep = 0.05, weighting = 0.7) # default settings
write_rds(SSC, "data/geo/SSC_2016_AUST.Rds")
unlink("data-raw/geo/1270055003_ssc_2016_aust_shape", recursive = TRUE)Areas without matches using original names
SUB %>%
left_join(SSC) %>%
select(-geometry) %>%
filter(SSC_NAME16 != SSC_NAME16_orig) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"))| SSC_NAME16 | SSC_CODE16 | SSC_NAME16_orig |
|---|---|---|
| Albion | 30024 | Albion (Brisbane - Qld) |
| Ascot | 30089 | Ascot (Brisbane - Qld) |
| Bald Hills | 30123 | Bald Hills (Qld) |
| Balmoral | 30132 | Balmoral (Qld) |
| Belmont | 30226 | Belmont (Qld) |
| Brighton | 30375 | Brighton (Qld) |
| Brookfield | 30388 | Brookfield (Qld) |
| Carina | 30542 | Carina (Qld) |
| Chandler | 30583 | Chandler (Qld) |
| Chapel Hill | 30584 | Chapel Hill (Qld) |
| Durack | 30913 | Durack (Qld) |
| Fairfield | 31025 | Fairfield (Qld) |
| Grange | 31236 | Grange (Qld) |
| Hamilton | 31301 | Hamilton (Qld) |
| Jindalee | 31459 | Jindalee (Qld) |
| Kangaroo Point | 31496 | Kangaroo Point (Qld) |
| Kooringal | 31586 | Kooringal (Qld) |
| Larapinta | 31654 | Larapinta (Qld) |
| Macgregor | 31736 | Macgregor (Qld) |
| Mackenzie | 31741 | Mackenzie (Brisbane - Qld) |
| Manly | 31764 | Manly (Qld) |
| Mansfield | 31768 | Mansfield (Qld) |
| Mcdowall | 31806 | McDowall |
| Middle Park | 31839 | Middle Park (Qld) |
| Milton | 31864 | Milton (Qld) |
| Mount Coot-Tha | 31969 | Mount Coot-tha |
| Newstead | 32156 | Newstead (Qld) |
| Northgate | 32207 | Northgate (Qld) |
| Oxley | 32262 | Oxley (Qld) |
| Paddington | 32269 | Paddington (Qld) |
| Red Hill | 32424 | Red Hill (Brisbane - Qld) |
| Richlands | 32447 | Richlands (Qld) |
| Robertson | 32467 | Robertson (Qld) |
| Rocklea | 32474 | Rocklea (Qld) |
| Salisbury | 32527 | Salisbury (Qld) |
| Sandgate | 32532 | Sandgate (Qld) |
| Seven Hills | 32562 | Seven Hills (Qld) |
| Sherwood | 32579 | Sherwood (Qld) |
| Spring Hill | 32648 | Spring Hill (Qld) |
| Tennyson | 32798 | Tennyson (Qld) |
| The Gap | 32817 | The Gap (Brisbane - Qld) |
| Virginia | 32981 | Virginia (Qld) |
| West End | 33063 | West End (Brisbane - Qld) |
| Windsor | 33126 | Windsor (Qld) |
| Wishart | 33132 | Wishart (Qld) |
Stones Corner doesn’t exist in ABS but it does in BCC. It seems it’s part of Greenslopes.
SUB %>%
left_join(SSC) %>%
select(-geometry) %>%
filter(is.na(SSC_CODE16))## # A tibble: 1 x 3
## SSC_NAME16 SSC_CODE16 SSC_NAME16_orig
## <chr> <int> <chr>
## 1 Stones Corner NA <NA>
Full map
SSC %<>%
right_join(SUB) %>%
filter(SSC_NAME16 != "Stones Corner")
# SSC %>%
# plot(max.plot = 1)
qtm(SSC, fill = NULL, borders = "darkorchid4",
text ="SSC_NAME16", text.col = "darkorchid4")SEIFA <- read_xls("data-raw/SEIFA/2033055001 - ssc indexes.xls",
sheet = "Table 1", skip = 5, n_max = 13719, na = "-") %>%
clean_names() %>%
remove_empty(c("rows", "cols")) %>%
dplyr::rename(SSC_CODE16 = x1,
SSC_NAME16 = x2,
IRSD = score_3,
IRSD_d = decile_4,
IRSAD = score_5,
IRSAD_d = decile_6,
IER = score_7,
IER_d = decile_8,
IEO = score_9,
IEO_d = decile_10,
URP = x11,
caution = x12) %>%
mutate(SSC_CODE16 = as.integer(SSC_CODE16),
IRSD = as.integer(IRSD),
IRSAD = as.integer(IRSAD),
IER = as.integer(IER),
IEO = as.integer(IEO),
IRSD_d = as.integer(IRSD_d),
IRSAD_d = as.integer(IRSAD_d),
IER_d = as.integer(IER_d),
IEO_d = as.integer(IEO_d),
URP = as.integer(URP)
) %>%
mutate(caution = as.logical(ifelse(is.na(caution), "False", "True")))
write_rds(SEIFA, "data/SEIFA/SEIFA.Rds")SEIFA %<>%
select(-SSC_NAME16)
frq(SEIFA$IRSD_d)##
## x <integer>
## # total N=13713 valid N=13691 mean=5.50 sd=2.87
##
## Value | N | Raw % | Valid % | Cum. %
## ---------------------------------------
## 1 | 1369 | 9.98 | 10.00 | 10.00
## 2 | 1369 | 9.98 | 10.00 | 20.00
## 3 | 1370 | 9.99 | 10.01 | 30.01
## 4 | 1370 | 9.99 | 10.01 | 40.01
## 5 | 1367 | 9.97 | 9.98 | 50.00
## 6 | 1370 | 9.99 | 10.01 | 60.00
## 7 | 1369 | 9.98 | 10.00 | 70.00
## 8 | 1369 | 9.98 | 10.00 | 80.00
## 9 | 1371 | 10.00 | 10.01 | 90.02
## 10 | 1367 | 9.97 | 9.98 | 100.00
## <NA> | 22 | 0.16 | <NA> | <NA>
SEIFA %>%
ggplot(aes(x = IRSD_d)) +
geom_bar() SEIFA %>%
ggplot(aes(x = as.factor(IRSD_d), y = IRSD)) +
geom_boxplot(varwidth = TRUE) SSC %<>%
left_join(SEIFA)
# SSC %>%
# st_drop_geometry() %>%
# glimpse()
frq(SSC$IRSD_d)##
## x <integer>
## # total N=193 valid N=184 mean=7.79 sd=2.37
##
## Value | N | Raw % | Valid % | Cum. %
## -------------------------------------
## 1 | 3 | 1.55 | 1.63 | 1.63
## 2 | 6 | 3.11 | 3.26 | 4.89
## 3 | 5 | 2.59 | 2.72 | 7.61
## 4 | 5 | 2.59 | 2.72 | 10.33
## 5 | 13 | 6.74 | 7.07 | 17.39
## 6 | 12 | 6.22 | 6.52 | 23.91
## 7 | 28 | 14.51 | 15.22 | 39.13
## 8 | 18 | 9.33 | 9.78 | 48.91
## 9 | 33 | 17.10 | 17.93 | 66.85
## 10 | 61 | 31.61 | 33.15 | 100.00
## <NA> | 9 | 4.66 | <NA> | <NA>
SSC %>%
ggplot(aes(x = IRSD_d)) +
geom_bar() SSC %>%
filter(!is.na(IRSD_d)) %>%
ggplot(aes(x = as.factor(IRSD_d), y = IRSD)) +
geom_boxplot(varwidth = TRUE) Few areas with missing SEIFA
SSC %>%
st_drop_geometry() %>%
filter_at(vars(ends_with("_d")),
any_vars(is.na(.))) %>%
select(SSC_NAME16, ends_with("_d"), URP)## SSC_NAME16 IRSD_d IRSAD_d IER_d IEO_d URP
## 1 Banks Creek NA NA NA NA NA
## 2 Brisbane Airport NA NA NA NA NA
## 3 Eagle Farm NA NA NA NA NA
## 4 Enoggera Reservoir NA NA NA 10 25
## 5 Karawatha NA NA NA NA NA
## 6 Larapinta NA NA NA NA NA
## 7 Lytton NA NA NA NA NA
## 8 Mount Coot-Tha NA NA NA NA NA
## 9 Port Of Brisbane NA NA NA NA NA
These were excluded.
SSC %<>%
filter_at(vars(ends_with("_d")), all_vars(!is.na(.))) Few cases with ABS flag caution.
frq(SSC$caution)##
## x <lgl>
## # total N=184 valid N=184 mean=0.02 sd=0.15
##
## Value | N | Raw % | Valid % | Cum. %
## --------------------------------------
## FALSE | 180 | 97.83 | 97.83 | 97.83
## TRUE | 4 | 2.17 | 2.17 | 100.00
## <NA> | 0 | 0.00 | <NA> | <NA>
Usually with very small pop numbers.
SSC %>%
st_drop_geometry() %>%
group_by(caution) %>%
summarise(mean = mean(URP),
min = min(URP),
max = max(URP))## # A tibble: 2 x 4
## caution mean min max
## <lgl> <dbl> <int> <int>
## 1 FALSE 6295. 23 22904
## 2 TRUE 38.8 28 49
SSC %>%
st_drop_geometry() %>%
filter(caution) %>%
select(-caution, -SSC_CODE16, -SSC_NAME16_orig)## SSC_NAME16 IRSD IRSD_d IRSAD IRSAD_d IER IER_d IEO IEO_d URP
## 1 Bulwer 1014 6 996 6 1012 5 1024 7 49
## 2 Cowan Cowan 1014 6 996 6 1012 5 1024 7 28
## 3 England Creek 1004 5 978 5 1063 8 949 3 33
## 4 Kooringal 1014 6 996 6 1012 5 1024 7 45
These remain included.
Original values of indices were used to calculate ‘local deciles’ using SSCs for Brisbane only
SSC %<>%
mutate(IRSD_d_orig = IRSD_d) %>%
mutate(IRSD_d = ntile(IRSD, 10)) %>%
mutate(IRSAD_d_orig = IRSAD_d) %>%
mutate(IRSAD_d = ntile(IRSAD, 10)) %>%
mutate(IER_d_orig = IER_d) %>%
mutate(IER_d = ntile(IER, 10)) %>%
mutate(IEO_d_orig = IEO_d) %>%
mutate(IEO_d = ntile(IEO, 10))
write_rds(SSC, "data/geo/SSC.Rds")IRSD_d# frq(SSC$IRSD_d_orig)
# frq(SSC$IRSD_d)
SSC %>%
st_drop_geometry() %>%
tabyl(IRSD_d_orig, IRSD_d)## IRSD_d_orig 1 10 2 3 4 5 6 7 8 9
## 1 3 0 0 0 0 0 0 0 0 0
## 2 6 0 0 0 0 0 0 0 0 0
## 3 5 0 0 0 0 0 0 0 0 0
## 4 5 0 0 0 0 0 0 0 0 0
## 5 0 0 13 0 0 0 0 0 0 0
## 6 0 0 6 6 0 0 0 0 0 0
## 7 0 0 0 13 15 0 0 0 0 0
## 8 0 0 0 0 4 14 0 0 0 0
## 9 0 0 0 0 0 4 18 11 0 0
## 10 0 18 0 0 0 0 0 7 18 18
plot_xtab(SSC$IRSD_d, SSC$IRSD_d_orig,
margin = "row", bar.pos = "stack",
show.summary = TRUE, coord.flip = TRUE)tabz <- table(SSC$IRSD_d, SSC$IRSD_d_orig)
assocplot(tabz,
xlab = "IRSD_d", ylab = "IRSD_d_orig")mosaicplot(tabz,
xlab = "IRSD_d", ylab = "IRSD_d_orig")tm_shape(SSC) +
tm_polygons(col = "IRSD_d", n = 10, palette = "div",
id = "SSC_NAME16",
popup.vars = c("SSC_NAME16", "IRSD_d", "IRSD"))IRSAD_d# frq(SSC$IRSAD_d_orig)
# frq(SSC$IRSAD_d)
SSC %>%
st_drop_geometry() %>%
tabyl(IRSAD_d_orig, IRSAD_d)## IRSAD_d_orig 1 10 2 3 4 5 6 7 8 9
## 1 3 0 0 0 0 0 0 0 0 0
## 2 4 0 0 0 0 0 0 0 0 0
## 3 4 0 0 0 0 0 0 0 0 0
## 4 2 0 0 0 0 0 0 0 0 0
## 5 6 0 2 0 0 0 0 0 0 0
## 6 0 0 11 0 0 0 0 0 0 0
## 7 0 0 6 5 0 0 0 0 0 0
## 8 0 0 0 14 7 0 0 0 0 0
## 9 0 0 0 0 12 18 5 0 0 0
## 10 0 18 0 0 0 0 13 18 18 18
plot_xtab(SSC$IRSAD_d, SSC$IRSAD_d_orig,
margin = "row", bar.pos = "stack",
show.summary = TRUE, coord.flip = TRUE)tabz <- table(SSC$IRSAD_d, SSC$IRSAD_d_orig)
assocplot(tabz,
xlab = "IRSAD_d", ylab = "IRSAD_d_orig")mosaicplot(tabz,
xlab = "IRSAD_d", ylab = "IRSAD_d_orig")tm_shape(SSC) +
tm_polygons(col = "IRSAD_d", n = 10, palette = "div",
id = "SSC_NAME16",
popup.vars = c("SSC_NAME16", "IRSAD_d", "IRSAD"))IER_d# frq(SSC$IER_d_orig)
# frq(SSC$IER_d)
SSC %>%
st_drop_geometry() %>%
tabyl(IER_d_orig, IER_d)## IER_d_orig 1 10 2 3 4 5 6 7 8 9
## 1 19 0 1 0 0 0 0 0 0 0
## 2 0 0 18 4 0 0 0 0 0 0
## 3 0 0 0 15 3 0 0 0 0 0
## 4 0 0 0 0 16 2 0 0 0 0
## 5 0 0 0 0 0 16 3 0 0 0
## 6 0 0 0 0 0 0 15 5 0 0
## 7 0 0 0 0 0 0 0 6 0 0
## 8 0 0 0 0 0 0 0 7 5 0
## 9 0 0 0 0 0 0 0 0 13 3
## 10 0 18 0 0 0 0 0 0 0 15
plot_xtab(SSC$IER_d, SSC$IER_d_orig,
margin = "row", bar.pos = "stack",
show.summary = TRUE, coord.flip = TRUE)tabz <- table(SSC$IER_d, SSC$IER_d_orig)
assocplot(tabz,
xlab = "IER_d", ylab = "IER_d_orig")mosaicplot(tabz,
xlab = "IER_d", ylab = "IER_d_orig")tm_shape(SSC) +
tm_polygons(col = "IER_d", n = 10, palette = "div",
id = "SSC_NAME16",
popup.vars = c("SSC_NAME16", "IER_d", "IER"))IEO_d# frq(SSC$IEO_d_orig)
# frq(SSC$IEO_d)
SSC %>%
st_drop_geometry() %>%
tabyl(IEO_d_orig, IEO_d)## IEO_d_orig 1 10 2 3 4 5 6 7 8 9
## 1 4 0 0 0 0 0 0 0 0 0
## 2 3 0 0 0 0 0 0 0 0 0
## 3 3 0 0 0 0 0 0 0 0 0
## 4 9 0 0 0 0 0 0 0 0 0
## 5 0 0 5 0 0 0 0 0 0 0
## 6 0 0 4 0 0 0 0 0 0 0
## 7 0 0 10 7 0 0 0 0 0 0
## 8 0 0 0 12 7 0 0 0 0 0
## 9 0 0 0 0 12 18 3 0 0 0
## 10 0 18 0 0 0 0 15 18 18 18
plot_xtab(SSC$IEO_d, SSC$IEO_d_orig,
margin = "row", bar.pos = "stack",
show.summary = TRUE, coord.flip = TRUE)tabz <- table(SSC$IEO_d, SSC$IEO_d_orig)
assocplot(tabz,
xlab = "IEO_d", ylab = "IEO_d_orig")mosaicplot(tabz,
xlab = "IEO_d", ylab = "IEO_d_orig")tm_shape(SSC) +
tm_polygons(col = "IEO_d", n = 10, palette = "div",
id = "SSC_NAME16",
popup.vars = c("SSC_NAME16", "IEO_d", "IEO"))dog_cost <- read_xlsx("data-raw/costs/dog_expensive.xlsx") %>%
clean_names() %>%
remove_empty(c("rows", "cols")) %>%
select(-web_source) %>%
rename(dog_breed = breed) %>%
select(dog_breed) %>%
distinct() %>%
# correcting names for better matching - these one used in BCE
mutate(
dog_breed = case_when(
dog_breed == "Hairless Chinese Crested" ~ "Chinese Crested Dog",
dog_breed == "Saint Bernard" ~ "St Bernard",
TRUE ~ as.character(dog_breed))
) %>%
# synonyms
add_row(dog_breed = "Dogue de Bordeaux") %>%
add_row(dog_breed = "Bulldog") %>%
add_row(dog_breed = "British Bulldog") %>%
mutate(expensive = "yes") %>%
arrange(dog_breed)
write_rds(dog_cost, "data/costs/dog_cost.Rds")Top 20 most expensive dogs (+3 synonyms!)
dog_cost %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"))| dog_breed | expensive |
|---|---|
| Akita | yes |
| Azawakh | yes |
| Black Russian Terrier | yes |
| British Bulldog | yes |
| Bulldog | yes |
| Canadian Eskimo Dog | yes |
| Chinese Crested Dog | yes |
| Chow Chow | yes |
| Dogue de Bordeaux | yes |
| English Bulldog | yes |
| French Bulldog | yes |
| French Mastiff | yes |
| German Shepherd | yes |
| Irish Wolfhound | yes |
| Lowchen | yes |
| Maltese | yes |
| Pharaoh Hound | yes |
| Rottweiler | yes |
| Saluki | yes |
| Samoyed | yes |
| St Bernard | yes |
| Tibetan Mastiff | yes |
| Yorkshire Terrier | yes |
Data scraped from https://top10petinsurance.com.au/pet-insurance-prices on the 30th March 2020
# packages needed
# install.packages("rvest")
library(rvest)
library(tidyverse)
# scraping table done with this using chrome: https://www.r-bloggers.com/using-rvest-to-scrape-an-html-table/
url <- 'https://top10petinsurance.com.au/pet-insurance-prices/'
dog_insurance <- url %>%
xml2::read_html() %>%
html_nodes(xpath='//*[@id="post-1016"]/div/table') %>%
html_table()
dog_insurance <- dog_insurance[[1]]
head(dog_insurance)
write_rds(dog_insurance, "data-raw/costs/dog_insurance.Rds") # extracted on the 30 March 2020dog_insurance <- read_rds("data-raw/costs/dog_insurance.Rds") %>%
as_tibble() %>%
clean_names() %>%
remove_empty(c("rows", "cols")) %>%
# select(-) %>%
mutate(average_accident_policy_cost_annual =
gsub(",", "",
average_accident_policy_cost_annual,
fixed = TRUE),
average_illness_policy_cost_annual =
gsub(",", "",
average_illness_policy_cost_annual,
fixed = TRUE),
average_comprehensive_policy_cost_annual =
gsub(",", "",
average_comprehensive_policy_cost_annual,
fixed = TRUE)
) %>%
mutate(average_accident_policy_cost_annual =
as.numeric(gsub("$", "",
average_accident_policy_cost_annual,
fixed = TRUE)),
average_illness_policy_cost_annual =
as.numeric(gsub("$", "",
average_illness_policy_cost_annual,
fixed = TRUE)),
average_comprehensive_policy_cost_annual =
as.numeric(gsub("$", "",
average_comprehensive_policy_cost_annual,
fixed = TRUE))
)
write_rds(dog_insurance, "data/costs/dog_insurance.Rds") dog_insurance %>%
frq(cost_compared_to_other_breeds) %>%
kableExtra::kable()
|
Breeds in Above average and Significantly above average categories:
dog_insurance %>%
select(dog_breed, average_comprehensive_policy_cost_annual, cost_compared_to_other_breeds) %>%
filter(cost_compared_to_other_breeds != "Below average") %>%
arrange(desc(average_comprehensive_policy_cost_annual)) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"))| dog_breed | average_comprehensive_policy_cost_annual | cost_compared_to_other_breeds |
|---|---|---|
| Grand Basset Griffon Vendeen | 1230 | Significantly above average |
| Hygen Hound Cross | 1221 | Significantly above average |
| Formosan Mountain Dog | 1221 | Significantly above average |
| Italian Cane Corso | 1221 | Significantly above average |
| Australian Bulldog Miniature | 1190 | Significantly above average |
| Australian Bulldog Miniature Cross | 1190 | Significantly above average |
| Drever | 1190 | Significantly above average |
| French Bulldog | 1189 | Significantly above average |
| Yorkshire Terrier | 1189 | Significantly above average |
| Airedale Terrier | 1189 | Significantly above average |
| Basset Hound | 1189 | Significantly above average |
| Bullmastiff | 1189 | Significantly above average |
| Great Dane | 1189 | Significantly above average |
| Rottweiler | 1189 | Significantly above average |
| Weimaraner | 1189 | Significantly above average |
| Alaskan Malamute | 1189 | Significantly above average |
| American Akita | 1189 | Significantly above average |
| Australian Bulldog | 1189 | Significantly above average |
| Bedlington Terrier | 1189 | Significantly above average |
| Boxer | 1189 | Significantly above average |
| British Bulldog | 1189 | Significantly above average |
| Dogue De Bordeaux | 1189 | Significantly above average |
| Drever Cross | 1189 | Significantly above average |
| Hygen Hound | 1189 | Significantly above average |
| Irish Setter | 1189 | Significantly above average |
| Irish Wolfhound | 1189 | Significantly above average |
| Mastiff | 1189 | Significantly above average |
| Miniature Doberman | 1189 | Significantly above average |
| Miniature Pinscher | 1189 | Significantly above average |
| Pekingese | 1189 | Significantly above average |
| Poodle – Standard | 1189 | Significantly above average |
| Shar-Pei | 1189 | Significantly above average |
| St Bernard | 1189 | Significantly above average |
| Welsh Corgi – Cardigan | 1189 | Significantly above average |
| Welsh Corgi – Pembroke | 1189 | Significantly above average |
| Wire-Haired Terrier | 1189 | Significantly above average |
| Bernese Mountain Dog | 1189 | Significantly above average |
| Unknown Dog Breed | 1168 | Significantly above average |
| Akita Inu | 1162 | Significantly above average |
| Alaskan Klee Kai | 1155 | Significantly above average |
| Neopolitan Mastiff | 1135 | Significantly above average |
| Newfoundland | 1135 | Significantly above average |
| Bull Terrier | 1110 | Significantly above average |
| Corgi | 945 | Above average |
| Alaskan Malamute Cross | 913 | Above average |
| Irish Wolfhound Cross | 913 | Above average |
| Rottweiler Cross | 913 | Above average |
| Polish Lowland Sheepdog Cross | 903 | Above average |
| Corgi Cross | 900 | Above average |
| Welsh Corgi – Pembroke Cross | 885 | Above average |
| British Bulldog Cross | 885 | Above average |
| Weimaraner Cross | 885 | Above average |
| Miniature Pinscher Cross | 885 | Above average |
| Akita Inu Cross | 885 | Above average |
| American Bulldog | 885 | Above average |
| Australian Bulldog Cross | 885 | Above average |
| Bedlington Terrier Cross | 885 | Above average |
| Bernese Mountain Dog Cross | 885 | Above average |
| Boxer Cross | 885 | Above average |
| Chow Chow | 885 | Above average |
| Dobermann | 885 | Above average |
| Labrador Retriever | 885 | Above average |
| Mastiff Cross | 885 | Above average |
| Newfoundland Cross | 885 | Above average |
| Pekingese Cross | 885 | Above average |
| Poodle – Standard Cross | 885 | Above average |
| St Bernard Cross | 885 | Above average |
| Welsh Corgi – Cardigan Cross | 885 | Above average |
| Airedale Terrier Cross | 885 | Above average |
| American Akita Cross | 885 | Above average |
| American Cocker Spaniel | 885 | Above average |
| Basset Hound Cross | 885 | Above average |
| Bullmastiff Cross | 885 | Above average |
| Dachshund Cross | 885 | Above average |
| Doberman Pinscher | 885 | Above average |
| Dogue De Bordeaux Cross | 885 | Above average |
| English Pointer | 885 | Above average |
| French Bulldog Cross | 885 | Above average |
| German Short Haired Pointer Cross | 885 | Above average |
| Great Dane Cross | 885 | Above average |
| Irish Setter Cross | 885 | Above average |
| Labrador | 885 | Above average |
| Miniature Doberman Cross | 885 | Above average |
| Miniature Poodle | 885 | Above average |
| Old English Sheepdog | 885 | Above average |
| Papillon Cross | 885 | Above average |
| Polish Lowland Sheepdog | 885 | Above average |
| Rhodesian Ridgeback Cross | 885 | Above average |
| Samoyed | 885 | Above average |
| Shar-Pei Cross | 885 | Above average |
| Neopolitan Mastiff Cross | 885 | Above average |
| Wire-Haired Terrier Cross | 885 | Above average |
| Wolfhound Cross | 885 | Above average |
| Yorkshire Terrier Cross | 885 | Above average |
| Dalmatian | 885 | Above average |
| Bull Terrier Cross | 869 | Above average |
| King Charles Spaniel | 869 | Above average |
| Rhodesian Ridgeback | 858 | Above average |
| Wolfhound | 846 | Above average |
| Pinscher | 836 | Above average |
| Pinscher Cross | 808 | Above average |
dog_ownership <- read_csv("data-raw/permits/cars-bis-open-data-animal-permits-3-jan-2019.zip")Raw dataset consists of 107,405 records.
Excluding records with permit_name: Breeders Permit, Cattery Permit, Racehorses Permit, Pet Shop Permit, Domestic Dog Permit & Guard Dog Permit.
Excluding records without neighbourhood.
Excluding records without dog_breed values.
Excluding records with dog_breed listed as Unknown or Cross.
Stones Corner values were assigned to Greenslopes SSC (see information in section above).
dog_ownership %<>%
clean_names() %>%
remove_empty(c("rows", "cols")) %>%
# all the same here
select(-permit_group, -permit_status) %>%
# special permits?
filter(!permit_name %in% c("Breeders Permit", "Cattery Permit", "Racehorses Permit", "Pet Shop Permit")) %>%
filter(!permit_name %in% c("Domestic Dog Permit", "Guard Dog Permit")) %>%
rename(dog_breed = animal_breed,
SSC_NAME16 = application_location_suburb) %>%
mutate(SSC_NAME16 = str_to_title(SSC_NAME16)) %>%
# correct suburb
mutate(SSC_NAME16 = ifelse(SSC_NAME16 == "Stones Corner", "Greenslopes", SSC_NAME16)) %>%
# missing geo
filter(!is.na(SSC_NAME16)) %>%
# missing breed
filter(!is.na(dog_breed)) %>%
filter(!dog_breed %in% c("Unknown",
"Medium Cross Breed", "Small Cross Breed", "Large Cross Breed")) %>%
# few cleans for better matches
mutate(
dog_breed = case_when(
dog_breed == "German Shepherd Dog (Long Stock Coat)" ~ "German Shepherd",
dog_breed == "German Shepherd Dog" ~ "German Shepherd",
dog_breed == "Central Asian Shepherd Dog " ~ "Central Asian Shepherd",
dog_breed == "Kangal Dog" ~ "Kangal",
dog_breed == "Bulldog" ~ "British Bulldog",
dog_breed == "Collie (Rough)" ~ "Rough Collie",
dog_breed == "Collie (Smooth)" ~ "Smooth Collie",
TRUE ~ as.character(dog_breed))
)
write_rds(dog_ownership, "data/permits/dog_ownership.Rds") 106,018 records after exclusions.
dog_costdog_ownership_cost <- left_join(dog_ownership, dog_cost, by = "dog_breed")
# binary indicator "expensive" and "non-expensive" dog breeds according to dog_cost
dog_ownership_cost %<>%
mutate(
expensive = case_when(
is.na(expensive) ~ 0,
expensive == "yes" ~ 1
)
)
frq(dog_ownership_cost, expensive)##
## expensive <numeric>
## # total N=106018 valid N=106018 mean=0.14 sd=0.35
##
## Value | N | Raw % | Valid % | Cum. %
## ----------------------------------------
## 0 | 90940 | 85.78 | 85.78 | 85.78
## 1 | 15078 | 14.22 | 14.22 | 100.00
## <NA> | 0 | 0.00 | <NA> | <NA>
dog_insuranceCorrecting names for better match
dog_insurance %<>%
mutate(
dog_breed = case_when(
# this is a bit tricky! might need sensitivity?
dog_breed == "French Poodle" ~ "Poodle",
# just naming issues
dog_breed == "Poodle – Standard" ~ "Poodle (Standard)",
dog_breed == "Miniature Poodle" ~ "Poodle (Miniature)",
dog_breed == "Poodle – Toy" ~ "Poodle (Toy)",
dog_breed == "Shar-Pei" ~ "Shar Pei",
dog_breed == "German Short Haired Pointer" ~ "German Shorthaired Pointer",
dog_breed == "German Wire Haired Pointer" ~ "German Wirehaired Pointer",
dog_breed == "Collie – Rough" ~ "Rough Collie",
dog_breed == "Collie – Smooth" ~ "Smooth Collie",
dog_breed == "Miniature Schnauzer" ~ "Schnauzer (Miniature)",
dog_breed == "Schnauzer Giant" ~ "Schnauzer (Giant)",
dog_breed == "Lagotto Rom" ~ "Lagotto Romagnolo",
dog_breed == "Brittany Spaniel" ~ "Brittany",
dog_breed == "Staghound" ~ "Stag Hound",
dog_breed == "Kerry Blue" ~ "Kerry Blue Terrier",
dog_breed == "English Toy terrier" ~ "English Toy Terrier",
dog_breed == "Parson Jack Russell Terrier" ~ "Parson Russell Terrier",
dog_breed == "Welsh Corgi – Pembroke" ~ "Welsh Corgi (Pembroke)",
dog_breed == "American Cocker Spaniel" ~ "Cocker Spaniel (American)",
dog_breed == "Basset Fauve De Bretagne" ~ "Basset Fauve de Bretagne",
dog_breed == "Norwegian Elk Hound" ~ "Norwegian Elkhound",
dog_breed == "Cheasapeake Bay Retriever" ~ "Chesapeake Bay Retriever",
dog_breed == "Bouvier Des Flandres" ~ "Bouvier des Flandres",
dog_breed == "Miniature Bull Terrier" ~ "Bull Terrier (Miniature)",
dog_breed == "Munsterlander – Large" ~ "Large Munsterlander",
dog_breed == "Welsh Corgi – Cardigan" ~ "Welsh Corgi (Cardigan)",
dog_breed == "HamiltonStovare" ~ "Hamiltonstovare",
dog_breed == "Blue tick Coonhound" ~ "Bluetick Coonhound",
dog_breed == "Japanese Akita" ~ "Akita (Japanese)",
dog_breed == "Dogue De Bordeaux" ~ "Dogue de Bordeaux",
dog_breed == "Italian Cane Corso" ~ "Italian Corso Dog",
# different kelpies but same category anyway
dog_breed == "Australian Kelpie Sheepdog" ~ "Australian Kelpie",
# typo
dog_breed == "Neopolitan Mastiff" ~ "Neapolitan Mastiff",
TRUE ~ as.character(dog_breed)))Dog breeds without match
dog_ownership_cost %>% anti_join(
dog_insurance %>% select(dog_breed, cost_compared_to_other_breeds)
) %>%
group_by(dog_breed) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"))| dog_breed | n |
|---|---|
| Fox Terrier | 3302 |
| Schnauzer | 1591 |
| Welsh Corgi | 376 |
| Pointer | 128 |
| Akita | 55 |
| Chihuahua (Smooth Coat) | 33 |
| Chihuahua (Long Coat) | 30 |
| Fox Terrier (Smooth) | 20 |
| Australian Stumpy Tail Cattle Dog | 17 |
| Foxhound | 13 |
| White Swiss Shepherd Dog | 7 |
| Fox Terrier (Wire) | 6 |
| Glen of Imaal Terrier | 4 |
| Canadian Eskimo Dog | 3 |
| Canaan Dog | 2 |
| Central Asian Shepherd Dog | 2 |
| Australian Staghound | 1 |
| Eurasier | 1 |
| Portuguese Podengo | 1 |
| Swedish Lapphund | 1 |
Some further corrections still possible here:
dog_ownership_cost <- left_join(dog_ownership_cost,
dog_insurance %>%
select(dog_breed, cost_compared_to_other_breeds)
) %>%
mutate(
cost_compared_to_other_breeds = case_when(
# all the same
dog_breed == "Fox Terrier" ~ "Below average",
dog_breed == "Fox Terrier (Smooth)" ~ "Below average",
dog_breed == "Fox Terrier (Wire)" ~ "Below average",
dog_breed == "Schnauzer" ~ "Below average",
dog_breed == "Australian Stumpy Tail Cattle Dog" ~ "Below average",
dog_breed == "Foxhound" ~ "Below average",
dog_breed == "White Swiss Shepherd Dog" ~ "Below average",
# multiple options here, but all above going for conservative
dog_breed == "Welsh Corgi" ~ "Above average",
# taking values from Chihuahua
dog_breed == "Chihuahua (Smooth Coat)" ~ "Below average",
dog_breed == "Chihuahua (Long Coat)" ~ "Below average",
dog_breed == "Chihuahua (Long Coat)" ~ "Below average",
# naming - Dogue De Bordeaux
dog_breed == "French Mastiff" ~ "Significantly above average",
TRUE ~ as.character(cost_compared_to_other_breeds)))Few things left:
dog_ownership_cost %>%
filter(is.na(cost_compared_to_other_breeds)) %>%
group_by(dog_breed) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"))| dog_breed | n |
|---|---|
| Pointer | 128 |
| Akita | 55 |
| Glen of Imaal Terrier | 4 |
| Canadian Eskimo Dog | 3 |
| Canaan Dog | 2 |
| Central Asian Shepherd Dog | 2 |
| Australian Staghound | 1 |
| Eurasier | 1 |
| Portuguese Podengo | 1 |
| Swedish Lapphund | 1 |
Two largest groups:
Pointer - insufficient info! Akita - insufficient info! Could be Inu, could be Japanese
These observations will remain as NAs.
write_rds(dog_ownership_cost, "data/permits/dog_ownership_cost.Rds")